In [1]:
# Initializing Spark in Python
from pyspark import SparkContext
sc = SparkContext("local", "Load and save data")

Load and save data

File formats

Text, JSON, CSV, SequenceFiles, Protocol Buffers

File systems

NFS, HDFS, Amazon S3

Databases

HBase, Elasticsearch, Cassandra, JDBC
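
The same textFile call works across these storage layers; only the path scheme changes. A minimal sketch with illustrative placeholder paths:

In [ ]:
# the URI scheme selects the storage layer; all paths below are placeholders
local_rdd = sc.textFile("file:///tmp/log.txt")                # local filesystem / NFS mount
hdfs_rdd = sc.textFile("hdfs://namenode:8020/logs/log.txt")   # HDFS
s3_rdd = sc.textFile("s3a://my-bucket/logs/log.txt")          # Amazon S3 (requires the S3 connector and credentials)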

Text


In [14]:
# input: load a plain text file as an RDD of lines (one element per line)
input = sc.textFile("log.txt")
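
When a directory holds many small files, the companion call sc.wholeTextFiles returns (filename, content) pairs instead of individual lines; a short sketch with a placeholder directory name:

In [ ]:
# read every file under a directory as (path, whole-file-content) pairs; "logs" is illustrative
files = sc.wholeTextFiles("logs")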

In [15]:
# data: create a small in-memory RDD to use in the save example below
result = sc.parallelize([1, 2, 3, 4])

In [18]:
# output: saveAsTextFile writes a directory named "output.txt"
# containing one part-* file per partition, not a single text file
result.saveAsTextFile("output.txt")
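
Because the output is a directory of part files, it can be read back with the same textFile call; a quick check, assuming the cell above has run:

In [ ]:
# textFile accepts a directory and reads all part files inside it;
# note the saved integers come back as strings
reloaded = sc.textFile("output.txt")
print(reloaded.collect())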

JSON


In [46]:
import json

# input: load a text file where each line is a separate JSON record,
# then parse every line into a Python dict
input = sc.textFile("log.json")
data = input.map(lambda x: json.loads(x))
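
json.loads raises an error on a malformed line, which fails the whole task. A common guard is to drop bad records inside the transformation; a sketch of an alternative to the map above, using the same input:

In [ ]:
# parse leniently: skip lines that are not valid JSON instead of failing the job
def parse_json(line):
    try:
        return [json.loads(line)]
    except ValueError:
        return []

data = input.flatMap(parse_json)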

In [47]:
# output: keep only records whose 'code' field is present and truthy
# (assumes every record has a 'code' key, otherwise a KeyError fails the task),
# then serialize each dict back to a JSON string before saving
result = data.filter(lambda x: x['code']).map(lambda x: json.dumps(x))
result.saveAsTextFile("output.json")
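
As with the text example, output.json is written as a directory of part files; a quick way to verify the result, assuming the cell above has run:

In [ ]:
# read the saved directory back and parse a couple of records
reloaded = sc.textFile("output.json").map(json.loads)
print(reloaded.take(2))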